PM566 HW-1

Author

Vicki

library(leaflet)
library(ggplot2)

Step 1

# Load the two daily PM2.5 exports; the "(1)" file holds the 2002 records and
# the other file holds the 2022 records.
PM_2002 <- read.csv(file = "~/Downloads/ad_viz_plotval_data (1).csv")

PM_2022 <- read.csv(file = "~/Downloads/ad_viz_plotval_data.csv")
# Check dimensions (number of rows, number of columns) of the 2002 data
dim(PM_2002)
[1] 15976    22
# Check headers and footers: first rows of the 2002 data
head(PM_2002)
        Date Source  Site.ID POC Daily.Mean.PM2.5.Concentration    Units
1 01/05/2002    AQS 60010007   1                           25.1 ug/m3 LC
2 01/06/2002    AQS 60010007   1                           31.6 ug/m3 LC
3 01/08/2002    AQS 60010007   1                           21.4 ug/m3 LC
4 01/11/2002    AQS 60010007   1                           25.9 ug/m3 LC
5 01/14/2002    AQS 60010007   1                           34.5 ug/m3 LC
6 01/17/2002    AQS 60010007   1                           41.0 ug/m3 LC
  Daily.AQI.Value Local.Site.Name Daily.Obs.Count Percent.Complete
1              81       Livermore               1              100
2              93       Livermore               1              100
3              74       Livermore               1              100
4              82       Livermore               1              100
5              98       Livermore               1              100
6             115       Livermore               1              100
  AQS.Parameter.Code AQS.Parameter.Description Method.Code
1              88101  PM2.5 - Local Conditions         120
2              88101  PM2.5 - Local Conditions         120
3              88101  PM2.5 - Local Conditions         120
4              88101  PM2.5 - Local Conditions         120
5              88101  PM2.5 - Local Conditions         120
6              88101  PM2.5 - Local Conditions         120
                     Method.Description CBSA.Code
1 Andersen RAAS2.5-300 PM2.5 SEQ w/WINS     41860
2 Andersen RAAS2.5-300 PM2.5 SEQ w/WINS     41860
3 Andersen RAAS2.5-300 PM2.5 SEQ w/WINS     41860
4 Andersen RAAS2.5-300 PM2.5 SEQ w/WINS     41860
5 Andersen RAAS2.5-300 PM2.5 SEQ w/WINS     41860
6 Andersen RAAS2.5-300 PM2.5 SEQ w/WINS     41860
                          CBSA.Name State.FIPS.Code      State County.FIPS.Code
1 San Francisco-Oakland-Hayward, CA               6 California                1
2 San Francisco-Oakland-Hayward, CA               6 California                1
3 San Francisco-Oakland-Hayward, CA               6 California                1
4 San Francisco-Oakland-Hayward, CA               6 California                1
5 San Francisco-Oakland-Hayward, CA               6 California                1
6 San Francisco-Oakland-Hayward, CA               6 California                1
   County Site.Latitude Site.Longitude
1 Alameda      37.68753      -121.7842
2 Alameda      37.68753      -121.7842
3 Alameda      37.68753      -121.7842
4 Alameda      37.68753      -121.7842
5 Alameda      37.68753      -121.7842
6 Alameda      37.68753      -121.7842
# Last rows of the 2002 data
tail(PM_2002)
            Date Source  Site.ID POC Daily.Mean.PM2.5.Concentration    Units
15971 12/10/2002    AQS 61131003   1                             15 ug/m3 LC
15972 12/13/2002    AQS 61131003   1                             15 ug/m3 LC
15973 12/22/2002    AQS 61131003   1                              1 ug/m3 LC
15974 12/25/2002    AQS 61131003   1                             23 ug/m3 LC
15975 12/28/2002    AQS 61131003   1                              5 ug/m3 LC
15976 12/31/2002    AQS 61131003   1                              6 ug/m3 LC
      Daily.AQI.Value      Local.Site.Name Daily.Obs.Count Percent.Complete
15971              62 Woodland-Gibson Road               1              100
15972              62 Woodland-Gibson Road               1              100
15973               6 Woodland-Gibson Road               1              100
15974              77 Woodland-Gibson Road               1              100
15975              28 Woodland-Gibson Road               1              100
15976              33 Woodland-Gibson Road               1              100
      AQS.Parameter.Code AQS.Parameter.Description Method.Code
15971              88101  PM2.5 - Local Conditions         117
15972              88101  PM2.5 - Local Conditions         117
15973              88101  PM2.5 - Local Conditions         117
15974              88101  PM2.5 - Local Conditions         117
15975              88101  PM2.5 - Local Conditions         117
15976              88101  PM2.5 - Local Conditions         117
                         Method.Description CBSA.Code
15971 R & P Model 2000 PM2.5 Sampler w/WINS     40900
15972 R & P Model 2000 PM2.5 Sampler w/WINS     40900
15973 R & P Model 2000 PM2.5 Sampler w/WINS     40900
15974 R & P Model 2000 PM2.5 Sampler w/WINS     40900
15975 R & P Model 2000 PM2.5 Sampler w/WINS     40900
15976 R & P Model 2000 PM2.5 Sampler w/WINS     40900
                                    CBSA.Name State.FIPS.Code      State
15971 Sacramento--Roseville--Arden-Arcade, CA               6 California
15972 Sacramento--Roseville--Arden-Arcade, CA               6 California
15973 Sacramento--Roseville--Arden-Arcade, CA               6 California
15974 Sacramento--Roseville--Arden-Arcade, CA               6 California
15975 Sacramento--Roseville--Arden-Arcade, CA               6 California
15976 Sacramento--Roseville--Arden-Arcade, CA               6 California
      County.FIPS.Code County Site.Latitude Site.Longitude
15971              113   Yolo      38.66121      -121.7327
15972              113   Yolo      38.66121      -121.7327
15973              113   Yolo      38.66121      -121.7327
15974              113   Yolo      38.66121      -121.7327
15975              113   Yolo      38.66121      -121.7327
15976              113   Yolo      38.66121      -121.7327
# Check variable names and types: column names of the 2002 data
names(PM_2002)
 [1] "Date"                           "Source"                        
 [3] "Site.ID"                        "POC"                           
 [5] "Daily.Mean.PM2.5.Concentration" "Units"                         
 [7] "Daily.AQI.Value"                "Local.Site.Name"               
 [9] "Daily.Obs.Count"                "Percent.Complete"              
[11] "AQS.Parameter.Code"             "AQS.Parameter.Description"     
[13] "Method.Code"                    "Method.Description"            
[15] "CBSA.Code"                      "CBSA.Name"                     
[17] "State.FIPS.Code"                "State"                         
[19] "County.FIPS.Code"               "County"                        
[21] "Site.Latitude"                  "Site.Longitude"                
# Column types and a preview of the values in the 2002 data
str(PM_2002)
'data.frame':   15976 obs. of  22 variables:
 $ Date                          : chr  "01/05/2002" "01/06/2002" "01/08/2002" "01/11/2002" ...
 $ Source                        : chr  "AQS" "AQS" "AQS" "AQS" ...
 $ Site.ID                       : int  60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 ...
 $ POC                           : int  1 1 1 1 1 1 1 1 1 1 ...
 $ Daily.Mean.PM2.5.Concentration: num  25.1 31.6 21.4 25.9 34.5 41 29.3 15 18.8 37.9 ...
 $ Units                         : chr  "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" ...
 $ Daily.AQI.Value               : int  81 93 74 82 98 115 89 62 69 107 ...
 $ Local.Site.Name               : chr  "Livermore" "Livermore" "Livermore" "Livermore" ...
 $ Daily.Obs.Count               : int  1 1 1 1 1 1 1 1 1 1 ...
 $ Percent.Complete              : num  100 100 100 100 100 100 100 100 100 100 ...
 $ AQS.Parameter.Code            : int  88101 88101 88101 88101 88101 88101 88101 88101 88101 88101 ...
 $ AQS.Parameter.Description     : chr  "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" ...
 $ Method.Code                   : int  120 120 120 120 120 120 120 120 120 120 ...
 $ Method.Description            : chr  "Andersen RAAS2.5-300 PM2.5 SEQ w/WINS" "Andersen RAAS2.5-300 PM2.5 SEQ w/WINS" "Andersen RAAS2.5-300 PM2.5 SEQ w/WINS" "Andersen RAAS2.5-300 PM2.5 SEQ w/WINS" ...
 $ CBSA.Code                     : int  41860 41860 41860 41860 41860 41860 41860 41860 41860 41860 ...
 $ CBSA.Name                     : chr  "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" ...
 $ State.FIPS.Code               : int  6 6 6 6 6 6 6 6 6 6 ...
 $ State                         : chr  "California" "California" "California" "California" ...
 $ County.FIPS.Code              : int  1 1 1 1 1 1 1 1 1 1 ...
 $ County                        : chr  "Alameda" "Alameda" "Alameda" "Alameda" ...
 $ Site.Latitude                 : num  37.7 37.7 37.7 37.7 37.7 ...
 $ Site.Longitude                : num  -122 -122 -122 -122 -122 ...
# Repeat the same checks for 2022: dimensions (rows, columns)
dim(PM_2022)
[1] 59918    22
# First rows of the 2022 data
head(PM_2022)
        Date Source  Site.ID POC Daily.Mean.PM2.5.Concentration    Units
1 01/01/2022    AQS 60010007   3                           12.7 ug/m3 LC
2 01/02/2022    AQS 60010007   3                           13.9 ug/m3 LC
3 01/03/2022    AQS 60010007   3                            7.1 ug/m3 LC
4 01/04/2022    AQS 60010007   3                            3.7 ug/m3 LC
5 01/05/2022    AQS 60010007   3                            4.2 ug/m3 LC
6 01/06/2022    AQS 60010007   3                            3.8 ug/m3 LC
  Daily.AQI.Value Local.Site.Name Daily.Obs.Count Percent.Complete
1              58       Livermore               1              100
2              60       Livermore               1              100
3              39       Livermore               1              100
4              21       Livermore               1              100
5              23       Livermore               1              100
6              21       Livermore               1              100
  AQS.Parameter.Code AQS.Parameter.Description Method.Code
1              88101  PM2.5 - Local Conditions         170
2              88101  PM2.5 - Local Conditions         170
3              88101  PM2.5 - Local Conditions         170
4              88101  PM2.5 - Local Conditions         170
5              88101  PM2.5 - Local Conditions         170
6              88101  PM2.5 - Local Conditions         170
                    Method.Description CBSA.Code
1 Met One BAM-1020 Mass Monitor w/VSCC     41860
2 Met One BAM-1020 Mass Monitor w/VSCC     41860
3 Met One BAM-1020 Mass Monitor w/VSCC     41860
4 Met One BAM-1020 Mass Monitor w/VSCC     41860
5 Met One BAM-1020 Mass Monitor w/VSCC     41860
6 Met One BAM-1020 Mass Monitor w/VSCC     41860
                          CBSA.Name State.FIPS.Code      State County.FIPS.Code
1 San Francisco-Oakland-Hayward, CA               6 California                1
2 San Francisco-Oakland-Hayward, CA               6 California                1
3 San Francisco-Oakland-Hayward, CA               6 California                1
4 San Francisco-Oakland-Hayward, CA               6 California                1
5 San Francisco-Oakland-Hayward, CA               6 California                1
6 San Francisco-Oakland-Hayward, CA               6 California                1
   County Site.Latitude Site.Longitude
1 Alameda      37.68753      -121.7842
2 Alameda      37.68753      -121.7842
3 Alameda      37.68753      -121.7842
4 Alameda      37.68753      -121.7842
5 Alameda      37.68753      -121.7842
6 Alameda      37.68753      -121.7842
# Last rows of the 2022 data
tail(PM_2022)
            Date Source  Site.ID POC Daily.Mean.PM2.5.Concentration    Units
59913 12/01/2022    AQS 61131003   1                            3.4 ug/m3 LC
59914 12/07/2022    AQS 61131003   1                            3.8 ug/m3 LC
59915 12/13/2022    AQS 61131003   1                            6.0 ug/m3 LC
59916 12/19/2022    AQS 61131003   1                           34.8 ug/m3 LC
59917 12/25/2022    AQS 61131003   1                           23.2 ug/m3 LC
59918 12/31/2022    AQS 61131003   1                            1.0 ug/m3 LC
      Daily.AQI.Value      Local.Site.Name Daily.Obs.Count Percent.Complete
59913              19 Woodland-Gibson Road               1              100
59914              21 Woodland-Gibson Road               1              100
59915              33 Woodland-Gibson Road               1              100
59916              99 Woodland-Gibson Road               1              100
59917              77 Woodland-Gibson Road               1              100
59918               6 Woodland-Gibson Road               1              100
      AQS.Parameter.Code AQS.Parameter.Description Method.Code
59913              88101  PM2.5 - Local Conditions         145
59914              88101  PM2.5 - Local Conditions         145
59915              88101  PM2.5 - Local Conditions         145
59916              88101  PM2.5 - Local Conditions         145
59917              88101  PM2.5 - Local Conditions         145
59918              88101  PM2.5 - Local Conditions         145
                                         Method.Description CBSA.Code
59913 R & P Model 2025 PM-2.5 Sequential Air Sampler w/VSCC     40900
59914 R & P Model 2025 PM-2.5 Sequential Air Sampler w/VSCC     40900
59915 R & P Model 2025 PM-2.5 Sequential Air Sampler w/VSCC     40900
59916 R & P Model 2025 PM-2.5 Sequential Air Sampler w/VSCC     40900
59917 R & P Model 2025 PM-2.5 Sequential Air Sampler w/VSCC     40900
59918 R & P Model 2025 PM-2.5 Sequential Air Sampler w/VSCC     40900
                                    CBSA.Name State.FIPS.Code      State
59913 Sacramento--Roseville--Arden-Arcade, CA               6 California
59914 Sacramento--Roseville--Arden-Arcade, CA               6 California
59915 Sacramento--Roseville--Arden-Arcade, CA               6 California
59916 Sacramento--Roseville--Arden-Arcade, CA               6 California
59917 Sacramento--Roseville--Arden-Arcade, CA               6 California
59918 Sacramento--Roseville--Arden-Arcade, CA               6 California
      County.FIPS.Code County Site.Latitude Site.Longitude
59913              113   Yolo      38.66121      -121.7327
59914              113   Yolo      38.66121      -121.7327
59915              113   Yolo      38.66121      -121.7327
59916              113   Yolo      38.66121      -121.7327
59917              113   Yolo      38.66121      -121.7327
59918              113   Yolo      38.66121      -121.7327
# Column names of the 2022 data
names(PM_2022)
 [1] "Date"                           "Source"                        
 [3] "Site.ID"                        "POC"                           
 [5] "Daily.Mean.PM2.5.Concentration" "Units"                         
 [7] "Daily.AQI.Value"                "Local.Site.Name"               
 [9] "Daily.Obs.Count"                "Percent.Complete"              
[11] "AQS.Parameter.Code"             "AQS.Parameter.Description"     
[13] "Method.Code"                    "Method.Description"            
[15] "CBSA.Code"                      "CBSA.Name"                     
[17] "State.FIPS.Code"                "State"                         
[19] "County.FIPS.Code"               "County"                        
[21] "Site.Latitude"                  "Site.Longitude"                
# Column types and a preview of the values in the 2022 data
str(PM_2022)
'data.frame':   59918 obs. of  22 variables:
 $ Date                          : chr  "01/01/2022" "01/02/2022" "01/03/2022" "01/04/2022" ...
 $ Source                        : chr  "AQS" "AQS" "AQS" "AQS" ...
 $ Site.ID                       : int  60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 ...
 $ POC                           : int  3 3 3 3 3 3 3 3 3 3 ...
 $ Daily.Mean.PM2.5.Concentration: num  12.7 13.9 7.1 3.7 4.2 3.8 2.3 6.9 13.6 11.2 ...
 $ Units                         : chr  "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" ...
 $ Daily.AQI.Value               : int  58 60 39 21 23 21 13 38 59 55 ...
 $ Local.Site.Name               : chr  "Livermore" "Livermore" "Livermore" "Livermore" ...
 $ Daily.Obs.Count               : int  1 1 1 1 1 1 1 1 1 1 ...
 $ Percent.Complete              : num  100 100 100 100 100 100 100 100 100 100 ...
 $ AQS.Parameter.Code            : int  88101 88101 88101 88101 88101 88101 88101 88101 88101 88101 ...
 $ AQS.Parameter.Description     : chr  "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" ...
 $ Method.Code                   : int  170 170 170 170 170 170 170 170 170 170 ...
 $ Method.Description            : chr  "Met One BAM-1020 Mass Monitor w/VSCC" "Met One BAM-1020 Mass Monitor w/VSCC" "Met One BAM-1020 Mass Monitor w/VSCC" "Met One BAM-1020 Mass Monitor w/VSCC" ...
 $ CBSA.Code                     : int  41860 41860 41860 41860 41860 41860 41860 41860 41860 41860 ...
 $ CBSA.Name                     : chr  "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" ...
 $ State.FIPS.Code               : int  6 6 6 6 6 6 6 6 6 6 ...
 $ State                         : chr  "California" "California" "California" "California" ...
 $ County.FIPS.Code              : int  1 1 1 1 1 1 1 1 1 1 ...
 $ County                        : chr  "Alameda" "Alameda" "Alameda" "Alameda" ...
 $ Site.Latitude                 : num  37.7 37.7 37.7 37.7 37.7 ...
 $ Site.Longitude                : num  -122 -122 -122 -122 -122 ...
# Check distribution for 2002
# Titles and axis labels are set explicitly: the defaults are built from the
# deparsed call (e.g. "Histogram of PM_2002$Daily.Mean...") and carry no units.
# The unit label follows the Units column shown by head()/tail().
hist(
  PM_2002$Daily.Mean.PM2.5.Concentration,
  main = "Daily Mean PM2.5 Concentration, 2002",
  xlab = "Daily mean PM2.5 (ug/m3 LC)"
)

# na.rm = TRUE keeps density() from erroring if a missing value is present.
plot(
  density(PM_2002$Daily.Mean.PM2.5.Concentration, na.rm = TRUE),
  main = "Density of Daily Mean PM2.5 Concentration, 2002",
  xlab = "Daily mean PM2.5 (ug/m3 LC)"
)

# Check distribution for 2022
hist(
  PM_2022$Daily.Mean.PM2.5.Concentration,
  main = "Daily Mean PM2.5 Concentration, 2022",
  xlab = "Daily mean PM2.5 (ug/m3 LC)"
)

plot(
  density(PM_2022$Daily.Mean.PM2.5.Concentration, na.rm = TRUE),
  main = "Density of Daily Mean PM2.5 Concentration, 2022",
  xlab = "Daily mean PM2.5 (ug/m3 LC)"
)

Summary:

- Dimensions: 15976 rows and 22 columns for 2002; 59918 rows and 22 columns for 2022.
- Headers & Footers: Consistent between both datasets.
- Variable Names and Types: Same variable names across both datasets. Daily Mean PM2.5 Concentration was collected as a continuous variable.

For the distribution of Daily Mean PM 2.5 Concentration during 2002, the histogram distribution is skewed to the right which means that the air quality is relatively good, with some occasional days with very high pollution levels for California. For 2022, the mean PM 2.5 concentration levels are more condensed, suggesting that PM 2.5 levels remained relatively consistent given the one spike in histogram and density plot.

Step 2

# Replace each record's daily date with its calendar year so the two data
# sets can be told apart after stacking (the day and month are discarded).
PM_2002[["Date"]] <- 2002
PM_2022[["Date"]] <- 2022

# Stack both years into one data frame; the columns are identical.
combined_PM <- rbind(PM_2002, PM_2022)

library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
# Shorten the column names used in the rest of the analysis. State and County
# already carry the desired names, so they need no entry here.
# Note: City holds the monitoring site name (Local.Site.Name).
combined_PM <- rename(
  combined_PM,
  Year = Date,
  PM25 = Daily.Mean.PM2.5.Concentration,
  City = Local.Site.Name,
  Lat = Site.Latitude,
  Lon = Site.Longitude
)

Step 3

# combined_PM has one row per daily record, so each site's coordinates repeat
# many times. Keep one row per location and year so the map draws a single
# marker per site instead of stacking identical markers on top of each other.
site_locations <- combined_PM %>%
  distinct(Year, Lat, Lon)

# Colour markers by year: blue = 2002, red = 2022.
year_pal <- colorFactor(
  palette = c("blue", "red"),
  domain = site_locations$Year
)

leaflet(site_locations) %>%
  addTiles() %>%
  addCircleMarkers(~Lon, ~Lat,
                   color = ~year_pal(Year),
                   radius = 1,
                   fillOpacity = 0.6,
                   popup = ~paste("Year:", Year)) %>%
  addLegend("bottomright", pal = year_pal, values = site_locations$Year,
            title = "Monitoring Sites")

From 2002 to 2022, the distribution of the monitoring sites does change. There is an increase in monitoring sites, which is evident in the increased appearance of red dots depicted on the map. Upon closer observation, in 2002, most of the monitoring sites were placed in bigger cities with only a few placed between the cities. In 2022, this distribution changes, with significantly more monitoring sites located in smaller and lesser-known cities as well.

Step 4

# Check for missing or implausible values of PM2.5:
# first count the missing (NA) values
sum(is.na(combined_PM$PM25))
[1] 0
# Range and quartiles of PM2.5, to spot implausible extremes
summary(combined_PM$PM25)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  -6.70    4.40    7.60   10.04   12.20  302.50 
# Flag implausible values for PM2.5: 1 when the value is missing, negative,
# or above 500; 0 otherwise.
combined_PM <- mutate(
  combined_PM,
  issue_flag = as.numeric(is.na(PM25) | PM25 < 0 | PM25 > 500)
)

# Per year: number of records, number of flagged records, and the flagged
# share of all records.
issue_summary <- summarise(
  group_by(combined_PM, Year),
  total_obs = n(),
  issues = sum(issue_flag),
  prop_issues = issues / total_obs
)

print(issue_summary)
# A tibble: 2 × 4
   Year total_obs issues prop_issues
  <dbl>     <int>  <dbl>       <dbl>
1  2002     15976      0     0      
2  2022     59918    215     0.00359

In 2002, there are no missing or implausible values of PM2.5. This suggests that there were little to no problems with the data quality for that year. By 2022, a small proportion of values (0.36%, 215 of 59918) were flagged as problematic. Because there are no missing values and the maximum (302.5) is below the 500 cutoff, all of the flagged values are negative concentrations. Although this is a very small fraction, it does suggest a shift from complete data quality to evidence of some issues in 2022. Possible explanations for this change could be the increase in monitoring sites, changes in protocols, or an increase in entry errors in recent years.

Step 5

Level 1: State-Level

# Visualization
# Year is stored as a number, so it is converted to a factor on the x axis:
# the two years are categories, not points on a continuous scale. The legend
# title is set explicitly so it does not read "factor(Year)".
ggplot(combined_PM, aes(x = factor(Year), y = PM25, fill = factor(Year))) +
  geom_boxplot() +
  labs(title = "California PM 2.5 Distribution in 2002 vs. 2022",
       x = "Year",
       y = "Daily Mean PM2.5",
       fill = "Year") +
  scale_fill_manual(values = c("blue", "red"))

# Summary
# One row per year: range, mean, median, standard deviation and record count
# of the daily PM2.5 values.
state_summary <- summarise(
  group_by(combined_PM, Year),
  min_pm25 = min(PM25, na.rm = TRUE),
  max_pm25 = max(PM25, na.rm = TRUE),
  mean_pm25 = mean(PM25, na.rm = TRUE),
  median_pm25 = median(PM25, na.rm = TRUE),
  sd_pm25 = sd(PM25, na.rm = TRUE),
  n_obs = n()
)

print(state_summary)
# A tibble: 2 × 7
   Year min_pm25 max_pm25 mean_pm25 median_pm25 sd_pm25 n_obs
  <dbl>    <dbl>    <dbl>     <dbl>       <dbl>   <dbl> <int>
1  2002      0       104.     16.1         12     13.9  15976
2  2022     -6.7     302.      8.41         6.8    7.64 59918

State-level Results: At the state level, there is an improvement in the distribution of PM2.5 concentration levels from 2002 to 2022. In 2002, the median is closer to the first quartile (Q1), indicating a high concentration of low PM2.5 values. The mean is also higher than the median in 2002, which means there are unusually high values pulling the mean up. The distribution becomes narrower in 2022, with a lower mean, median, and standard deviation, suggesting reduced variability across monitoring sites statewide. Thus, the boxplots for 2002 and 2022 demonstrate an improvement in PM2.5 concentration levels over time.

Level 2: County-Level

# Visualization
# Plot one point per county and year (the mean of the daily values) rather
# than every daily record, so the chart matches its title and each gray
# segment joins a county's 2002 mean to its 2022 mean.
county_means <- combined_PM %>%
  group_by(County, Year) %>%
  summarise(mean_pm25 = mean(PM25, na.rm = TRUE), .groups = "drop")

# Counties are ordered on the y axis by their mean PM2.5.
ggplot(county_means, aes(x = mean_pm25, y = reorder(County, mean_pm25))) +
  geom_line(aes(group = County), color = "gray70") +
  geom_point(aes(color = factor(Year)), size = 1) +
  labs(title = "PM2.5 Levels by County: 2002 vs. 2022",
       x = "Mean Daily PM2.5", y = "County", color = "Year")

# Summary
# One row per county and year: record count, mean, median and range of the
# daily PM2.5 values. The result is returned ungrouped.
county_summary <- summarise(
  group_by(combined_PM, County, Year),
  n_obs = n(),
  mean_pm25 = mean(PM25, na.rm = TRUE),
  median_pm25 = median(PM25, na.rm = TRUE),
  min_pm25 = min(PM25, na.rm = TRUE),
  max_pm25 = max(PM25, na.rm = TRUE),
  .groups = "drop"
)

print(county_summary)
# A tibble: 98 × 7
   County        Year n_obs mean_pm25 median_pm25 min_pm25 max_pm25
   <chr>        <dbl> <int>     <dbl>       <dbl>    <dbl>    <dbl>
 1 Alameda       2002   201     14.3         10        1.9     61.6
 2 Alameda       2022  1793      8.20         7       -0.7     35.5
 3 Butte         2002   473     14.8         11.5      1       88  
 4 Butte         2022  1121      6.19         4.5     -0.6     42.8
 5 Calaveras     2002    60      9.9          8        2       40  
 6 Calaveras     2022   355      6.04         5        0       25.9
 7 Colusa        2002    95     11.7          9        1       57  
 8 Colusa        2022   401      7.61         6.7      0.6     37  
 9 Contra Costa  2002   276     15.1          9.5      2       76.7
10 Contra Costa  2022   815      8.24         7.2      0.9     37.3
# ℹ 88 more rows

County-Level Results: The dot plot shows mean PM2.5 concentrations for each county in California in 2002 (red) and 2022 (blue). Overall, the majority of counties showed a decline in mean PM2.5 levels over the 20-year period, suggesting improvements in air quality across the state. A few counties display little change or higher PM2.5 levels in 2022, which could be due to local emissions, wildfires, or measurement differences. The plot highlights both geographical variation and temporal trends, illustrating that while statewide improvements are evident, the magnitude of change varies between counties.

Level 3: City Level

# Records from monitoring sites in Los Angeles County.
la_data <- combined_PM %>%
  filter(County == "Los Angeles")
# Visualization
# This filter only changes combined_PM; la_data was already extracted above.
combined_PM <- combined_PM %>%
  filter(!is.na(City))

# Mean PM2.5 per site and year. Sites with a missing or blank name are left
# out of the plot because they cannot be labelled on the y axis.
la_city_means <- la_data %>%
  filter(!is.na(City), City != "") %>%
  group_by(City, Year) %>%
  summarise(mean_pm25 = mean(PM25, na.rm = TRUE), .groups = "drop")

# Order sites by their mean PM2.5. The ordering value must be a numeric
# column; passing the whole data frame here is what triggered the
# "argument is not numeric or logical" warnings.
ggplot(la_city_means, aes(x = mean_pm25, y = reorder(City, mean_pm25))) +
  geom_line(aes(group = City), color = "gray70") +
  geom_point(aes(color = factor(Year)), size = 1) +
  labs(x = "Mean Daily PM2.5", y = "City", color = "Year",
       title = "Mean PM2.5 by City in LA County: 2002 vs. 2022")
Warning in split.default(x = seq_len(nrow(x)), f = f, drop = drop, ...): data
length is not a multiple of split variable
Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
returning NA
Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
returning NA
Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
returning NA
Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
returning NA
Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
returning NA
Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
returning NA
Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
returning NA
Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
returning NA
Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
returning NA
Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
returning NA
Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
returning NA
Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
returning NA
Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
returning NA
Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
returning NA
Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
returning NA
Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
returning NA
Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
returning NA
Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
returning NA
Warning in split.default(x = seq_len(nrow(x)), f = f, drop = drop, ...): data
length is not a multiple of split variable
Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
returning NA
Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
returning NA
Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
returning NA
Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
returning NA
Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
returning NA
Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
returning NA
Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
returning NA
Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
returning NA
Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
returning NA
Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
returning NA
Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
returning NA
Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
returning NA
Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
returning NA
Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
returning NA
Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
returning NA
Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
returning NA
Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
returning NA
Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
returning NA

# Summary
# One row per LA County site and year: record count, mean, median and range
# of the daily PM2.5 values. The result is returned ungrouped.
city_summary <- summarise(
  group_by(la_data, City, Year),
  n_obs = n(),
  mean_pm25 = mean(PM25, na.rm = TRUE),
  median_pm25 = median(PM25, na.rm = TRUE),
  min_pm25 = min(PM25, na.rm = TRUE),
  max_pm25 = max(PM25, na.rm = TRUE),
  .groups = "drop"
)

print(city_summary)
# A tibble: 25 × 7
   City                       Year n_obs mean_pm25 median_pm25 min_pm25 max_pm25
   <chr>                     <dbl> <int>     <dbl>       <dbl>    <dbl>    <dbl>
 1 ""                         2002   118     23.9        21.4       5.6     61  
 2 "Azusa"                    2002   339     20.8        18.7       3.1     72.4
 3 "Azusa"                    2022    76      9.72        9.65      3.1     18.4
 4 "Burbank"                  2002   122     24.0        21.6       3.5     63  
 5 "Compton"                  2022   723     13.0        11.9       2.6     54.6
 6 "Glendora"                 2022   365      8.42        7.8      -0.8     56  
 7 "Lancaster-Division Stre…  2002   107     10.4        10         1       24  
 8 "Lancaster-Division Stre…  2022   348      7.52        7.3       1.9     15.1
 9 "Lebec"                    2002   109      4.82        4.8       0.6     12.4
10 "Lebec"                    2022    41      3.50        3.4       0.9      7.3
# ℹ 15 more rows
# Five highest site-by-year mean PM2.5 values in LA County.
# .groups = "drop" removes the grouping by City that summarise() would
# otherwise leave behind; with it in place slice_head() works per city and
# returns every row instead of the top five overall.
top5_cities <- la_data %>%
  group_by(City, Year) %>%
  summarise(mean_pm25 = mean(PM25, na.rm = TRUE), .groups = "drop") %>%
  arrange(desc(mean_pm25)) %>%
  slice_head(n = 5)
`summarise()` has grouped output by 'City'. You can override using the
`.groups` argument.
# Show the resulting table of site-by-year means
print(top5_cities)
# A tibble: 25 × 3
# Groups:   City [18]
   City                         Year mean_pm25
   <chr>                       <dbl>     <dbl>
 1 ""                           2002     23.9 
 2 "Azusa"                      2002     20.8 
 3 "Azusa"                      2022      9.72
 4 "Burbank"                    2002     24.0 
 5 "Compton"                    2022     13.0 
 6 "Glendora"                   2022      8.42
 7 "Lancaster-Division Street"  2002     10.4 
 8 "Lancaster-Division Street"  2022      7.52
 9 "Lebec"                      2002      4.82
10 "Lebec"                      2022      3.50
# ℹ 15 more rows

City-Level Results: At the city level, the dot plot shows a general decrease in mean PM2.5 concentration levels across almost all monitoring sites in Los Angeles County from 2002 to 2022. In densely populated cities such as Los Angeles, the mean declined by almost half (~21 to ~11 PM2.5 by 2022), suggesting that pollution controls and air quality management efforts have been impactful in urban areas. Smaller cities also show improvements by 2022, although the decrease is smaller in magnitude. Overall, the dot plot highlights both a general downward trend and some variation in the degree of improvement between cities, given differences in local emission sources, geography, and population density.